## imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.stem import PorterStemmer
import re
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default='notebook'
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
# Fetch the NLTK corpora used below (no-op if already present locally):
# 'stopwords' for stopword removal, 'punkt' for tokenization support.
nltk.download('stopwords')
nltk.download('punkt')
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\Arunachala\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] C:\Users\Arunachala\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date!
True
from nltk.corpus import stopwords
# Load the training data, keeping only the two columns we need, and coerce
# the integer labels to strings so they can be one-hot encoded later.
df = pd.read_csv('data/train.csv', usecols=['text', 'label'])
df["label"] = df["label"].astype(str)
df.head()
| text | label | |
|---|---|---|
| 0 | i have to try but i am worried i had tried a d... | 7 |
| 1 | nice biryani | 2 |
| 2 | aap bahut aache se samjhati h thanks me ye rec... | 5 |
| 3 | thanks so much nisha ji | 1 |
| 4 | Achha bana tha mere wife anda or kalimirch v a... | 6 |
# Preprocessing baseline: estimate the vocabulary size as the number of
# distinct whitespace-separated tokens across the raw (uncleaned) comments.
words = [token for comment in df["text"] for token in comment.split()]
vocab_size = len(set(words))
# Text-cleaning helpers, built once at module level so clean() can reuse them.
stemmer = PorterStemmer()
# Raw strings so the backslash escapes (\[ \] \|) reach the regex engine
# intact -- the original non-raw literals rely on invalid string escapes,
# which raise SyntaxWarning on Python 3.12+ and will eventually error.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')  # punctuation replaced by a space
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')  # everything else (after lowercasing) is dropped
STOPWORDS = set(stopwords.words('english'))  # English stopwords from NLTK
def clean(text):
    """Normalize one raw comment for modeling.

    Lowercases the text, swaps the listed punctuation for spaces, strips all
    remaining characters outside [0-9a-z #+_], drops English stopwords and
    one-letter tokens, then Porter-stems the surviving tokens.
    """
    # lowercase, then apply the two module-level regex passes in order
    normalized = BAD_SYMBOLS_RE.sub('', REPLACE_BY_SPACE_RE.sub(' ', text.lower()))
    # remove stopwords first, then any token shorter than two characters
    tokens = [tok for tok in normalized.split() if tok not in STOPWORDS]
    tokens = [tok for tok in tokens if len(tok) >= 2]
    # stem each remaining token and rebuild a single space-joined string
    return ' '.join(stemmer.stem(tok) for tok in tokens)
# Run every comment through the cleaning pipeline and preview the result.
df["text"] = df["text"].map(clean)
df["text"].head()
0 tri worri tri differ recip biryani chicken did... 1 nice biryani 2 aap bahut aach se samjhati thank ye recipi tri... 3 thank much nisha ji 4 achha bana tha mere wife anda kalimirch add ki... Name: text, dtype: object
# Checking whether the classes are balanced: sample count per label.
df["label"].value_counts()
1 1142 6 1129 3 1127 5 1123 7 1121 4 1106 2 1092 Name: label, dtype: int64
# The counts above show no class is heavily over- or under-represented,
# so no over-/under-sampling is needed.
# Visualize the same per-label distribution as a bar chart.
label_counts = df.groupby('label')["text"].count()
fig = go.Figure([go.Bar(x=label_counts.keys(), y=label_counts.values)])
fig.show()
# Hold out 20% of the data for evaluation; the fixed seed makes the
# split reproducible across runs.
comments = df["text"].values
labels = df[["label"]].values
x_train, x_test, y_train, y_test = train_test_split(
    comments, labels, test_size=0.20, random_state=42
)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
(6272,) (6272, 1) (1568,) (1568, 1)
# Fit the tokenizer on the training split only; words unseen at fit time
# map to the dedicated '<OOV>' token at transform time.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index
print(f'number of unique tokens in x_train: {len(word_index)}')
number of unique tokens in x_train: 5950
# Convert texts to integer id sequences, then pad/truncate at the end
# ('post') so every sequence shares one fixed length.
MAX_SEQUENCE_LEN = 1000
x_train = pad_sequences(
    tokenizer.texts_to_sequences(x_train),
    maxlen=MAX_SEQUENCE_LEN, padding='post', truncating='post',
)
x_test = pad_sequences(
    tokenizer.texts_to_sequences(x_test),
    maxlen=MAX_SEQUENCE_LEN, padding='post', truncating='post',
)
print('Shape of data tensor:', x_train.shape)
print('Shape of data tensor:', x_test.shape)
Shape of data tensor: (6272, 1000) Shape of data tensor: (1568, 1000)
# One-hot encode the string labels: fit on train, reuse the same mapping on
# test, and densify the sparse matrices so Keras can consume them.
encode = OneHotEncoder()
y_train = encode.fit_transform(y_train).toarray()
y_test = encode.transform(y_test).toarray()
# Check categories
print(encode.categories_)
[array(['1', '2', '3', '4', '5', '6', '7'], dtype=object)]
# Sanity-check shapes and types before training (same print order as before).
print(x_train.shape)
print(y_test.shape)
print(x_test.shape)
print(y_train.shape)
for arr in (x_train, x_test, y_train, y_test):
    print(type(arr))
(6272, 1000) (1568, 7) (1568, 1000) (6272, 7) <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
# Stop training / shrink the learning rate once the validation metrics
# plateau, so the network does not keep training past its best epoch.
reduce_lr_accuracy = ReduceLROnPlateau(monitor='val_accuracy', factor=0.2, patience=2, min_lr=0.0005)
early_stop_accuracy = EarlyStopping(monitor='val_accuracy', mode='max', patience=2, verbose=1)
reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.0005)
early_stop_loss = EarlyStopping(monitor='val_loss', mode='min', patience=2, verbose=1)
# NOTE(review): EarlyStopping and ReduceLROnPlateau share patience=2, so a
# run can stop before the reduced LR has any epochs to help -- confirm intended.
# Hyperparameter candidates to search exhaustively (ParameterGrid yields
# every combination of the values below).
param_grid = {
    "filters": (60, 70, 80),          # Conv1D filter counts
    "activation": ("relu", "tanh"),   # Conv1D activation functions
    "embedding_dim": (64, 128),       # embedding vector sizes
    "batch_size": (32, 48),           # training batch sizes
}
grid = ParameterGrid(param_grid)
def compile_model(p, x_train=x_train):
    """Build and compile the CNN text classifier for one grid point.

    Args:
        p: parameter dict with keys "embedding_dim", "filters", and
           "activation" (one point from the ParameterGrid).
        x_train: padded training matrix; only its second dimension
           (the fixed sequence length) is read, to size the Embedding input.

    Returns:
        A compiled, untrained Keras Sequential model with a 7-way softmax
        output, categorical cross-entropy loss, and the Adam optimizer.
    """
    model = Sequential()
    model.add(
        Embedding(
            vocab_size,
            p["embedding_dim"],
            input_length=x_train.shape[1]
        )
    )
    model.add(
        Conv1D(
            filters=p["filters"],
            kernel_size=2,
            activation=p["activation"],
            padding='valid',
        )
    )
    # GlobalMaxPooling1D already emits a flat (batch, filters) tensor, so the
    # Flatten layer the original stacked after it was a no-op and is removed.
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(7, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
def hyperparameter_tuning(grid, x_train = x_train, y_train=y_train, x_test=x_test, y_test=y_test):
    """Train one model per parameter combination and score it on the test set.

    grid: iterable of parameter dicts (e.g. sklearn ParameterGrid).
    Returns a DataFrame with one row per combination and columns
    "params" (the dict) and "accuracy" (test-set accuracy).
    """
    # FIX: DataFrame.append was deprecated and removed in pandas 2.0, and
    # appending row-by-row rebuilt the frame each iteration; collect plain
    # dicts and build the DataFrame once at the end instead.
    rows = []
    for p in grid:
        model = compile_model(p, x_train)
        model.fit(x_train, y_train, shuffle=True,
                  epochs=10, batch_size=p["batch_size"],
                  validation_split=0.2,
                  callbacks=[reduce_lr_accuracy, reduce_lr_loss, early_stop_loss, early_stop_accuracy])
        y_pred = model.predict(x_test)
        accuracy = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1))
        rows.append({"params": p, "accuracy": accuracy})
    return pd.DataFrame(rows, columns=["params", "accuracy"])
Note: Do not run the following cell every time — it takes around 30 minutes to train all combinations. A csv file (hyper_parameter_tuning_results.csv) is saved in the outputs directory and can be loaded instead.
# WARNING: trains all 24 grid combinations (~30 min); prefer loading the
# saved results (commented line below) when they already exist
params = hyperparameter_tuning(grid)
# params = pd.read_csv("output/hyper_paramerter_tuning_results.csv")
Epoch 1/10 157/157 [==============================] - 7s 39ms/step - loss: 1.7038 - accuracy: 0.4273 - val_loss: 1.2353 - val_accuracy: 0.6598 Epoch 2/10 157/157 [==============================] - 9s 57ms/step - loss: 1.0622 - accuracy: 0.6637 - val_loss: 0.8134 - val_accuracy: 0.7649 Epoch 3/10 157/157 [==============================] - 11s 69ms/step - loss: 0.7958 - accuracy: 0.7528 - val_loss: 0.6865 - val_accuracy: 0.7801 Epoch 4/10 157/157 [==============================] - 11s 71ms/step - loss: 0.6384 - accuracy: 0.8065 - val_loss: 0.6386 - val_accuracy: 0.7825 Epoch 5/10 157/157 [==============================] - 9s 58ms/step - loss: 0.5248 - accuracy: 0.8523 - val_loss: 0.6222 - val_accuracy: 0.7920 Epoch 6/10 157/157 [==============================] - 9s 55ms/step - loss: 0.4599 - accuracy: 0.8714 - val_loss: 0.6163 - val_accuracy: 0.7904 Epoch 7/10 157/157 [==============================] - 7s 47ms/step - loss: 0.3953 - accuracy: 0.8908 - val_loss: 0.6168 - val_accuracy: 0.7920 Epoch 00007: early stopping Epoch 1/10 157/157 [==============================] - 12s 69ms/step - loss: 1.7147 - accuracy: 0.4128 - val_loss: 1.2466 - val_accuracy: 0.6359 Epoch 2/10 157/157 [==============================] - 10s 66ms/step - loss: 1.0641 - accuracy: 0.6542 - val_loss: 0.7826 - val_accuracy: 0.7625 Epoch 3/10 157/157 [==============================] - 11s 68ms/step - loss: 0.7795 - accuracy: 0.7546 - val_loss: 0.6658 - val_accuracy: 0.7833 Epoch 4/10 157/157 [==============================] - 11s 72ms/step - loss: 0.6224 - accuracy: 0.8136 - val_loss: 0.6131 - val_accuracy: 0.7984 Epoch 5/10 157/157 [==============================] - 9s 55ms/step - loss: 0.5055 - accuracy: 0.8505 - val_loss: 0.6033 - val_accuracy: 0.7968 Epoch 6/10 157/157 [==============================] - 9s 55ms/step - loss: 0.4335 - accuracy: 0.8752 - val_loss: 0.6009 - val_accuracy: 0.7928 Epoch 00006: early stopping Epoch 1/10 157/157 [==============================] - 11s 60ms/step - loss: 
1.6867 - accuracy: 0.4457 - val_loss: 1.2045 - val_accuracy: 0.6629 Epoch 2/10 157/157 [==============================] - 8s 53ms/step - loss: 1.0189 - accuracy: 0.6741 - val_loss: 0.7564 - val_accuracy: 0.7673 Epoch 3/10 157/157 [==============================] - 8s 52ms/step - loss: 0.7396 - accuracy: 0.7640 - val_loss: 0.6456 - val_accuracy: 0.7857 Epoch 4/10 157/157 [==============================] - 8s 52ms/step - loss: 0.5871 - accuracy: 0.8308 - val_loss: 0.6177 - val_accuracy: 0.7936 Epoch 5/10 157/157 [==============================] - 8s 52ms/step - loss: 0.4824 - accuracy: 0.8561 - val_loss: 0.5988 - val_accuracy: 0.7912 Epoch 6/10 157/157 [==============================] - 8s 52ms/step - loss: 0.4056 - accuracy: 0.8828 - val_loss: 0.5983 - val_accuracy: 0.7896 Epoch 00006: early stopping Epoch 1/10 157/157 [==============================] - 18s 107ms/step - loss: 1.6243 - accuracy: 0.4459 - val_loss: 1.1418 - val_accuracy: 0.6964- ETA - ETA: 0s - loss: 1.6296 - accuracy Epoch 2/10 157/157 [==============================] - 15s 96ms/step - loss: 1.0122 - accuracy: 0.6831 - val_loss: 0.7555 - val_accuracy: 0.7729 Epoch 3/10 157/157 [==============================] - 15s 98ms/step - loss: 0.7376 - accuracy: 0.7686 - val_loss: 0.6495 - val_accuracy: 0.7849 Epoch 4/10 157/157 [==============================] - 16s 99ms/step - loss: 0.5891 - accuracy: 0.8290 - val_loss: 0.6183 - val_accuracy: 0.7952 Epoch 5/10 157/157 [==============================] - 15s 97ms/step - loss: 0.4915 - accuracy: 0.8563 - val_loss: 0.6110 - val_accuracy: 0.7944 Epoch 6/10 157/157 [==============================] - 16s 103ms/step - loss: 0.4014 - accuracy: 0.8858 - val_loss: 0.6110 - val_accuracy: 0.7944 Epoch 00006: early stopping Epoch 1/10 157/157 [==============================] - 19s 117ms/step - loss: 1.6150 - accuracy: 0.4648 - val_loss: 1.0663 - val_accuracy: 0.6876 Epoch 2/10 157/157 [==============================] - 18s 114ms/step - loss: 0.9549 - accuracy: 0.6960 - 
val_loss: 0.7131 - val_accuracy: 0.7769 Epoch 3/10 157/157 [==============================] - 18s 113ms/step - loss: 0.6805 - accuracy: 0.7941 - val_loss: 0.6265 - val_accuracy: 0.7936 Epoch 4/10 157/157 [==============================] - 17s 111ms/step - loss: 0.5337 - accuracy: 0.8395 - val_loss: 0.6104 - val_accuracy: 0.7944 Epoch 5/10 157/157 [==============================] - 18s 114ms/step - loss: 0.4359 - accuracy: 0.8776 - val_loss: 0.5937 - val_accuracy: 0.7976 Epoch 6/10 157/157 [==============================] - 17s 111ms/step - loss: 0.3614 - accuracy: 0.8981 - val_loss: 0.6027 - val_accuracy: 0.7888 Epoch 7/10 157/157 [==============================] - 18s 115ms/step - loss: 0.3318 - accuracy: 0.9113 - val_loss: 0.6110 - val_accuracy: 0.7888 Epoch 00007: early stopping Epoch 00007: early stopping Epoch 1/10 157/157 [==============================] - 18s 107ms/step - loss: 1.6204 - accuracy: 0.4497 - val_loss: 1.0943 - val_accuracy: 0.7052 Epoch 2/10 157/157 [==============================] - 15s 93ms/step - loss: 0.9393 - accuracy: 0.7024 - val_loss: 0.7116 - val_accuracy: 0.7753 Epoch 3/10 157/157 [==============================] - 16s 100ms/step - loss: 0.6772 - accuracy: 0.7899 - val_loss: 0.6323 - val_accuracy: 0.7880- ETA: 1s - loss: 0 Epoch 4/10 157/157 [==============================] - 15s 96ms/step - loss: 0.5148 - accuracy: 0.8525 - val_loss: 0.6069 - val_accuracy: 0.7944 Epoch 5/10 157/157 [==============================] - 16s 99ms/step - loss: 0.4254 - accuracy: 0.8788 - val_loss: 0.6078 - val_accuracy: 0.7952 Epoch 6/10 157/157 [==============================] - 16s 103ms/step - loss: 0.3465 - accuracy: 0.9009 - val_loss: 0.6132 - val_accuracy: 0.7920 Epoch 00006: early stopping Epoch 1/10 105/105 [==============================] - 8s 74ms/step - loss: 1.7997 - accuracy: 0.3887 - val_loss: 1.4897 - val_accuracy: 0.5291 Epoch 2/10 105/105 [==============================] - 7s 69ms/step - loss: 1.2506 - accuracy: 0.5956 - val_loss: 0.9537 - 
val_accuracy: 0.7315 Epoch 3/10 105/105 [==============================] - 7s 68ms/step - loss: 0.9057 - accuracy: 0.7198 - val_loss: 0.7461 - val_accuracy: 0.7705 Epoch 4/10 105/105 [==============================] - 7s 63ms/step - loss: 0.7154 - accuracy: 0.7837 - val_loss: 0.6642 - val_accuracy: 0.7769 Epoch 5/10 105/105 [==============================] - 6s 61ms/step - loss: 0.6096 - accuracy: 0.8194 - val_loss: 0.6274 - val_accuracy: 0.7873 Epoch 6/10 105/105 [==============================] - 7s 65ms/step - loss: 0.5251 - accuracy: 0.8507 - val_loss: 0.6158 - val_accuracy: 0.7896 Epoch 7/10 105/105 [==============================] - 7s 68ms/step - loss: 0.4492 - accuracy: 0.8708 - val_loss: 0.6053 - val_accuracy: 0.7880 Epoch 8/10 105/105 [==============================] - 7s 68ms/step - loss: 0.3974 - accuracy: 0.8880 - val_loss: 0.6048 - val_accuracy: 0.7857 Epoch 00008: early stopping Epoch 1/10 105/105 [==============================] - 9s 81ms/step - loss: 1.8024 - accuracy: 0.3632 - val_loss: 1.5118 - val_accuracy: 0.6215 Epoch 2/10 105/105 [==============================] - 8s 73ms/step - loss: 1.2735 - accuracy: 0.6049 - val_loss: 0.9480 - val_accuracy: 0.7227 Epoch 3/10 105/105 [==============================] - 8s 72ms/step - loss: 0.8893 - accuracy: 0.7100 - val_loss: 0.7281 - val_accuracy: 0.7641 Epoch 4/10 105/105 [==============================] - 8s 72ms/step - loss: 0.6992 - accuracy: 0.7877 - val_loss: 0.6475 - val_accuracy: 0.7809 Epoch 5/10 105/105 [==============================] - 9s 81ms/step - loss: 0.5845 - accuracy: 0.8278 - val_loss: 0.6124 - val_accuracy: 0.7936 Epoch 6/10 105/105 [==============================] - 9s 90ms/step - loss: 0.4842 - accuracy: 0.8635 - val_loss: 0.5949 - val_accuracy: 0.7960 Epoch 7/10 105/105 [==============================] - 8s 80ms/step - loss: 0.4304 - accuracy: 0.8796 - val_loss: 0.5933 - val_accuracy: 0.7904 Epoch 8/10 105/105 [==============================] - 8s 72ms/step - loss: 0.3860 - 
accuracy: 0.8934 - val_loss: 0.5948 - val_accuracy: 0.7904 Epoch 00008: early stopping Epoch 1/10 105/105 [==============================] - 9s 81ms/step - loss: 1.7747 - accuracy: 0.4074 - val_loss: 1.4196 - val_accuracy: 0.5912 Epoch 2/10 105/105 [==============================] - 8s 81ms/step - loss: 1.1857 - accuracy: 0.6205 - val_loss: 0.8917 - val_accuracy: 0.7474 Epoch 3/10 105/105 [==============================] - 8s 76ms/step - loss: 0.8462 - accuracy: 0.7297 - val_loss: 0.7102 - val_accuracy: 0.7753 Epoch 4/10 105/105 [==============================] - 8s 79ms/step - loss: 0.6777 - accuracy: 0.7957 - val_loss: 0.6408 - val_accuracy: 0.7960 Epoch 5/10 105/105 [==============================] - 9s 91ms/step - loss: 0.5444 - accuracy: 0.8383 - val_loss: 0.6062 - val_accuracy: 0.7920 Epoch 6/10 105/105 [==============================] - 10s 93ms/step - loss: 0.4594 - accuracy: 0.8676 - val_loss: 0.6040 - val_accuracy: 0.7928 Epoch 00006: early stopping Epoch 1/10 105/105 [==============================] - 16s 144ms/step - loss: 1.6998 - accuracy: 0.4339 - val_loss: 1.2611 - val_accuracy: 0.6685 Epoch 2/10 105/105 [==============================] - 15s 141ms/step - loss: 1.0880 - accuracy: 0.6627 - val_loss: 0.8185 - val_accuracy: 0.7450 Epoch 3/10 105/105 [==============================] - 14s 132ms/step - loss: 0.8107 - accuracy: 0.7536 - val_loss: 0.6822 - val_accuracy: 0.7944 Epoch 4/10 105/105 [==============================] - 13s 118ms/step - loss: 0.6265 - accuracy: 0.8188 - val_loss: 0.6286 - val_accuracy: 0.7944 Epoch 5/10 105/105 [==============================] - 13s 120ms/step - loss: 0.5291 - accuracy: 0.8495 - val_loss: 0.6094 - val_accuracy: 0.8024 Epoch 6/10 105/105 [==============================] - 12s 115ms/step - loss: 0.4421 - accuracy: 0.8786 - val_loss: 0.6167 - val_accuracy: 0.7976 Epoch 7/10 105/105 [==============================] - 12s 110ms/step - loss: 0.3838 - accuracy: 0.8928 - val_loss: 0.6263 - val_accuracy: 0.7896 Epoch 
00007: early stopping Epoch 00007: early stopping Epoch 1/10 105/105 [==============================] - 15s 135ms/step - loss: 1.7421 - accuracy: 0.4072 - val_loss: 1.3468 - val_accuracy: 0.6359 Epoch 2/10 105/105 [==============================] - 15s 143ms/step - loss: 1.1340 - accuracy: 0.6434 - val_loss: 0.8398 - val_accuracy: 0.7530 Epoch 3/10 105/105 [==============================] - 14s 132ms/step - loss: 0.8029 - accuracy: 0.7512 - val_loss: 0.6890 - val_accuracy: 0.7793 Epoch 4/10 105/105 [==============================] - 14s 137ms/step - loss: 0.6215 - accuracy: 0.8146 - val_loss: 0.6362 - val_accuracy: 0.7809 Epoch 5/10 105/105 [==============================] - 13s 128ms/step - loss: 0.5055 - accuracy: 0.8501 - val_loss: 0.6163 - val_accuracy: 0.7833 Epoch 6/10 105/105 [==============================] - 15s 147ms/step - loss: 0.4260 - accuracy: 0.8822 - val_loss: 0.6185 - val_accuracy: 0.7857 Epoch 7/10 105/105 [==============================] - 15s 144ms/step - loss: 0.3621 - accuracy: 0.8983 - val_loss: 0.6307 - val_accuracy: 0.7888 Epoch 00007: early stopping Epoch 1/10 105/105 [==============================] - 16s 149ms/step - loss: 1.6607 - accuracy: 0.4323 - val_loss: 1.1866 - val_accuracy: 0.6701 Epoch 2/10 105/105 [==============================] - 15s 142ms/step - loss: 1.0044 - accuracy: 0.6877 - val_loss: 0.7557 - val_accuracy: 0.7649 Epoch 3/10 105/105 [==============================] - 15s 139ms/step - loss: 0.7330 - accuracy: 0.7702 - val_loss: 0.6548 - val_accuracy: 0.7721 Epoch 4/10 105/105 [==============================] - 14s 137ms/step - loss: 0.5786 - accuracy: 0.8274 - val_loss: 0.6181 - val_accuracy: 0.7857 Epoch 5/10 105/105 [==============================] - 14s 132ms/step - loss: 0.4747 - accuracy: 0.8601 - val_loss: 0.5982 - val_accuracy: 0.7920 Epoch 6/10 105/105 [==============================] - 14s 132ms/step - loss: 0.4016 - accuracy: 0.8870 - val_loss: 0.5983 - val_accuracy: 0.7960 Epoch 7/10 105/105 
[==============================] - 14s 132ms/step - loss: 0.3464 - accuracy: 0.9037 - val_loss: 0.6201 - val_accuracy: 0.7841 Epoch 00007: early stopping Epoch 1/10 157/157 [==============================] - 8s 49ms/step - loss: 1.6740 - accuracy: 0.4258 - val_loss: 1.1851 - val_accuracy: 0.6470 Epoch 2/10 157/157 [==============================] - 8s 48ms/step - loss: 1.0014 - accuracy: 0.6789 - val_loss: 0.7755 - val_accuracy: 0.7721 Epoch 3/10 157/157 [==============================] - 7s 46ms/step - loss: 0.7037 - accuracy: 0.7780 - val_loss: 0.6476 - val_accuracy: 0.7825 Epoch 4/10 157/157 [==============================] - 7s 45ms/step - loss: 0.5243 - accuracy: 0.8413 - val_loss: 0.6031 - val_accuracy: 0.7888 Epoch 5/10 157/157 [==============================] - 8s 49ms/step - loss: 0.4110 - accuracy: 0.8808 - val_loss: 0.5956 - val_accuracy: 0.7952 Epoch 6/10 157/157 [==============================] - 7s 46ms/step - loss: 0.3206 - accuracy: 0.9107 - val_loss: 0.5978 - val_accuracy: 0.7920 Epoch 7/10 157/157 [==============================] - 7s 44ms/step - loss: 0.2779 - accuracy: 0.9243 - val_loss: 0.6007 - val_accuracy: 0.7920 Epoch 00007: early stopping Epoch 00007: early stopping Epoch 1/10 157/157 [==============================] - 9s 54ms/step - loss: 1.6888 - accuracy: 0.4168 - val_loss: 1.2023 - val_accuracy: 0.6574 Epoch 2/10 157/157 [==============================] - 8s 52ms/step - loss: 0.9880 - accuracy: 0.6869 - val_loss: 0.7570 - val_accuracy: 0.7506 Epoch 3/10 157/157 [==============================] - 8s 51ms/step - loss: 0.6728 - accuracy: 0.7837 - val_loss: 0.6520 - val_accuracy: 0.7649 Epoch 4/10 157/157 [==============================] - 8s 49ms/step - loss: 0.5050 - accuracy: 0.8487 - val_loss: 0.6191 - val_accuracy: 0.7833 Epoch 5/10 157/157 [==============================] - 8s 52ms/step - loss: 0.3827 - accuracy: 0.8912 - val_loss: 0.6104 - val_accuracy: 0.7888 Epoch 6/10 157/157 [==============================] - 8s 53ms/step - 
loss: 0.3146 - accuracy: 0.9115 - val_loss: 0.6174 - val_accuracy: 0.7896 Epoch 7/10 157/157 [==============================] - 8s 54ms/step - loss: 0.2594 - accuracy: 0.9288 - val_loss: 0.6355 - val_accuracy: 0.7785 Epoch 00007: early stopping Epoch 1/10 157/157 [==============================] - 10s 58ms/step - loss: 1.5998 - accuracy: 0.4439 - val_loss: 1.1179 - val_accuracy: 0.6159 Epoch 2/10 157/157 [==============================] - 9s 57ms/step - loss: 0.9494 - accuracy: 0.6805 - val_loss: 0.7425 - val_accuracy: 0.7618 Epoch 3/10 157/157 [==============================] - 9s 55ms/step - loss: 0.6457 - accuracy: 0.7947 - val_loss: 0.6366 - val_accuracy: 0.7849 Epoch 4/10 157/157 [==============================] - 8s 54ms/step - loss: 0.4810 - accuracy: 0.8577 - val_loss: 0.6018 - val_accuracy: 0.7928 Epoch 5/10 157/157 [==============================] - 8s 51ms/step - loss: 0.3687 - accuracy: 0.8922 - val_loss: 0.5949 - val_accuracy: 0.7857 Epoch 6/10 157/157 [==============================] - 9s 54ms/step - loss: 0.2899 - accuracy: 0.9181 - val_loss: 0.5993 - val_accuracy: 0.7801 Epoch 00006: early stopping Epoch 1/10 157/157 [==============================] - 15s 87ms/step - loss: 1.5635 - accuracy: 0.4509 - val_loss: 1.0465 - val_accuracy: 0.6908 Epoch 2/10 157/157 [==============================] - 13s 85ms/step - loss: 0.8932 - accuracy: 0.7156 - val_loss: 0.6955 - val_accuracy: 0.7753 Epoch 3/10 157/157 [==============================] - 14s 92ms/step - loss: 0.5957 - accuracy: 0.8158 - val_loss: 0.6096 - val_accuracy: 0.7849 Epoch 4/10 157/157 [==============================] - 16s 99ms/step - loss: 0.4286 - accuracy: 0.8742 - val_loss: 0.5894 - val_accuracy: 0.7833 Epoch 5/10 157/157 [==============================] - 13s 84ms/step - loss: 0.3207 - accuracy: 0.9161 - val_loss: 0.5925 - val_accuracy: 0.7888 Epoch 6/10 157/157 [==============================] - 13s 82ms/step - loss: 0.2584 - accuracy: 0.9282 - val_loss: 0.6043 - val_accuracy: 0.7936 
Epoch 00006: early stopping Epoch 1/10 157/157 [==============================] - 16s 96ms/step - loss: 1.5443 - accuracy: 0.4666 - val_loss: 1.0331 - val_accuracy: 0.6693 Epoch 2/10 157/157 [==============================] - 15s 96ms/step - loss: 0.8729 - accuracy: 0.7204 - val_loss: 0.6964 - val_accuracy: 0.7729 Epoch 3/10 157/157 [==============================] - 15s 96ms/step - loss: 0.5698 - accuracy: 0.8262 - val_loss: 0.6265 - val_accuracy: 0.7785 Epoch 4/10 157/157 [==============================] - 15s 98ms/step - loss: 0.4074 - accuracy: 0.8824 - val_loss: 0.6091 - val_accuracy: 0.7785 Epoch 5/10 157/157 [==============================] - 15s 97ms/step - loss: 0.3075 - accuracy: 0.9109 - val_loss: 0.6237 - val_accuracy: 0.7737 Epoch 00005: early stopping Epoch 1/10 157/157 [==============================] - 16s 98ms/step - loss: 1.5384 - accuracy: 0.4724 - val_loss: 0.9838 - val_accuracy: 0.6773 Epoch 2/10 157/157 [==============================] - 15s 95ms/step - loss: 0.8439 - accuracy: 0.7156 - val_loss: 0.6695 - val_accuracy: 0.7689 Epoch 3/10 157/157 [==============================] - 14s 92ms/step - loss: 0.5620 - accuracy: 0.8214 - val_loss: 0.6017 - val_accuracy: 0.7912 Epoch 4/10 157/157 [==============================] - 15s 97ms/step - loss: 0.4074 - accuracy: 0.8790 - val_loss: 0.5847 - val_accuracy: 0.7968 Epoch 5/10 157/157 [==============================] - 15s 94ms/step - loss: 0.3032 - accuracy: 0.9169 - val_loss: 0.5863 - val_accuracy: 0.7968 Epoch 6/10 157/157 [==============================] - 16s 101ms/step - loss: 0.2379 - accuracy: 0.9352 - val_loss: 0.6030 - val_accuracy: 0.7968 Epoch 00006: early stopping Epoch 00006: early stopping Epoch 1/10 105/105 [==============================] - 8s 70ms/step - loss: 1.7806 - accuracy: 0.3723 - val_loss: 1.4297 - val_accuracy: 0.5602 Epoch 2/10 105/105 [==============================] - 7s 67ms/step - loss: 1.1891 - accuracy: 0.6197 - val_loss: 0.9078 - val_accuracy: 0.7036 Epoch 3/10 
105/105 [==============================] - 7s 66ms/step - loss: 0.8220 - accuracy: 0.7377 - val_loss: 0.7171 - val_accuracy: 0.7594 Epoch 4/10 105/105 [==============================] - 7s 66ms/step - loss: 0.6231 - accuracy: 0.8083 - val_loss: 0.6413 - val_accuracy: 0.7865 Epoch 5/10 105/105 [==============================] - 7s 66ms/step - loss: 0.4839 - accuracy: 0.8621 - val_loss: 0.6077 - val_accuracy: 0.7888 Epoch 6/10 105/105 [==============================] - 7s 66ms/step - loss: 0.3900 - accuracy: 0.8912 - val_loss: 0.5991 - val_accuracy: 0.7920 Epoch 7/10 105/105 [==============================] - 7s 63ms/step - loss: 0.3298 - accuracy: 0.9075 - val_loss: 0.5922 - val_accuracy: 0.7952 Epoch 8/10 105/105 [==============================] - 7s 63ms/step - loss: 0.2877 - accuracy: 0.9211 - val_loss: 0.5944 - val_accuracy: 0.7968 Epoch 9/10 105/105 [==============================] - 7s 66ms/step - loss: 0.2405 - accuracy: 0.9354 - val_loss: 0.5997 - val_accuracy: 0.7936 Epoch 00009: early stopping Epoch 1/10 105/105 [==============================] - 10s 86ms/step - loss: 1.7305 - accuracy: 0.3911 - val_loss: 1.3524 - val_accuracy: 0.5163 Epoch 2/10 105/105 [==============================] - 9s 81ms/step - loss: 1.1450 - accuracy: 0.6137 - val_loss: 0.9160 - val_accuracy: 0.7100 Epoch 3/10 105/105 [==============================] - 8s 79ms/step - loss: 0.8186 - accuracy: 0.7317 - val_loss: 0.7221 - val_accuracy: 0.7610 Epoch 4/10 105/105 [==============================] - 8s 76ms/step - loss: 0.6030 - accuracy: 0.8160 - val_loss: 0.6379 - val_accuracy: 0.7833 Epoch 5/10 105/105 [==============================] - 8s 72ms/step - loss: 0.4645 - accuracy: 0.8659 - val_loss: 0.6114 - val_accuracy: 0.7865 Epoch 6/10 105/105 [==============================] - 8s 79ms/step - loss: 0.3672 - accuracy: 0.8971 - val_loss: 0.5967 - val_accuracy: 0.7920 Epoch 7/10 105/105 [==============================] - 8s 75ms/step - loss: 0.3018 - accuracy: 0.9145 - val_loss: 0.6029 - 
val_accuracy: 0.7857 Epoch 8/10 105/105 [==============================] - 8s 73ms/step - loss: 0.2652 - accuracy: 0.9276 - val_loss: 0.6117 - val_accuracy: 0.7825 Epoch 00008: early stopping Epoch 00008: early stopping Epoch 1/10 105/105 [==============================] - 10s 92ms/step - loss: 1.7029 - accuracy: 0.4218 - val_loss: 1.2804 - val_accuracy: 0.6064 Epoch 2/10 105/105 [==============================] - 9s 87ms/step - loss: 1.0800 - accuracy: 0.6396 - val_loss: 0.8441 - val_accuracy: 0.7235 Epoch 3/10 105/105 [==============================] - 9s 85ms/step - loss: 0.7501 - accuracy: 0.7542 - val_loss: 0.6836 - val_accuracy: 0.7697 Epoch 4/10 105/105 [==============================] - 9s 87ms/step - loss: 0.5617 - accuracy: 0.8290 - val_loss: 0.6230 - val_accuracy: 0.7841 Epoch 5/10 105/105 [==============================] - 9s 89ms/step - loss: 0.4473 - accuracy: 0.8661 - val_loss: 0.6088 - val_accuracy: 0.7849 Epoch 6/10 105/105 [==============================] - 9s 86ms/step - loss: 0.3485 - accuracy: 0.9009 - val_loss: 0.5994 - val_accuracy: 0.7912 Epoch 7/10 105/105 [==============================] - 9s 83ms/step - loss: 0.2902 - accuracy: 0.9159 - val_loss: 0.6041 - val_accuracy: 0.7896 Epoch 8/10 105/105 [==============================] - 9s 84ms/step - loss: 0.2458 - accuracy: 0.9314 - val_loss: 0.6211 - val_accuracy: 0.7841 Epoch 00008: early stopping Epoch 00008: early stopping Epoch 1/10 105/105 [==============================] - 15s 133ms/step - loss: 1.6483 - accuracy: 0.4271 - val_loss: 1.2063 - val_accuracy: 0.6375 Epoch 2/10 105/105 [==============================] - 14s 129ms/step - loss: 1.0148 - accuracy: 0.6747 - val_loss: 0.7809 - val_accuracy: 0.7546 Epoch 3/10 105/105 [==============================] - 14s 128ms/step - loss: 0.6918 - accuracy: 0.7849 - val_loss: 0.6452 - val_accuracy: 0.7896 Epoch 4/10 105/105 [==============================] - 13s 128ms/step - loss: 0.5181 - accuracy: 0.8461 - val_loss: 0.6021 - val_accuracy: 
0.7928 Epoch 5/10 105/105 [==============================] - 13s 126ms/step - loss: 0.4045 - accuracy: 0.8848 - val_loss: 0.5882 - val_accuracy: 0.7992 Epoch 6/10 105/105 [==============================] - 13s 128ms/step - loss: 0.3199 - accuracy: 0.9147 - val_loss: 0.5921 - val_accuracy: 0.7944 Epoch 7/10 105/105 [==============================] - 14s 132ms/step - loss: 0.2712 - accuracy: 0.9241 - val_loss: 0.5959 - val_accuracy: 0.7896 Epoch 00007: early stopping Epoch 00007: early stopping Epoch 1/10 105/105 [==============================] - 17s 150ms/step - loss: 1.6650 - accuracy: 0.4463 - val_loss: 1.1786 - val_accuracy: 0.6454 Epoch 2/10 105/105 [==============================] - 16s 149ms/step - loss: 0.9733 - accuracy: 0.6883 - val_loss: 0.7480 - val_accuracy: 0.7657 Epoch 3/10 105/105 [==============================] - 16s 154ms/step - loss: 0.6646 - accuracy: 0.8025 - val_loss: 0.6339 - val_accuracy: 0.7833 Epoch 4/10 105/105 [==============================] - 19s 178ms/step - loss: 0.4789 - accuracy: 0.8641 - val_loss: 0.5940 - val_accuracy: 0.7984 Epoch 5/10 105/105 [==============================] - 15s 147ms/step - loss: 0.3676 - accuracy: 0.8942 - val_loss: 0.5822 - val_accuracy: 0.7976 Epoch 6/10 105/105 [==============================] - 16s 154ms/step - loss: 0.2917 - accuracy: 0.9191 - val_loss: 0.5828 - val_accuracy: 0.7952 Epoch 00006: early stopping Epoch 1/10 105/105 [==============================] - 16s 147ms/step - loss: 1.6246 - accuracy: 0.4331 - val_loss: 1.1557 - val_accuracy: 0.6080 Epoch 2/10 105/105 [==============================] - 19s 182ms/step - loss: 0.9654 - accuracy: 0.6691 - val_loss: 0.7538 - val_accuracy: 0.7514 Epoch 3/10 105/105 [==============================] - 16s 153ms/step - loss: 0.6566 - accuracy: 0.7925 - val_loss: 0.6379 - val_accuracy: 0.7833 Epoch 4/10 105/105 [==============================] - 17s 158ms/step - loss: 0.4630 - accuracy: 0.8601 - val_loss: 0.5999 - val_accuracy: 0.7936 Epoch 5/10 105/105 
[==============================] - 16s 152ms/step - loss: 0.3605 - accuracy: 0.9007 - val_loss: 0.5941 - val_accuracy: 0.7920 Epoch 6/10 105/105 [==============================] - 16s 157ms/step - loss: 0.2887 - accuracy: 0.9181 - val_loss: 0.5996 - val_accuracy: 0.7912 Epoch 00006: early stopping
# rank the parameter combinations best-first by test accuracy
params = params.sort_values(by=["accuracy"], ascending=False)
params.head()
| params | accuracy | |
|---|---|---|
| 6 | {'activation': 'relu', 'batch_size': 48, 'embe... | 0.775510 |
| 3 | {'activation': 'relu', 'batch_size': 32, 'embe... | 0.771684 |
| 21 | {'activation': 'tanh', 'batch_size': 48, 'embe... | 0.771046 |
| 15 | {'activation': 'tanh', 'batch_size': 32, 'embe... | 0.769770 |
| 22 | {'activation': 'tanh', 'batch_size': 48, 'embe... | 0.769133 |
# as the tuning takes about an hour, let us store our tuning results in a csv
# in case we want to use the results again without spending the 1 hour again
# NOTE(review): "paramerter" is a typo, but the read-back path in the earlier
# cell uses the same spelling — rename both together or not at all
params.to_csv("output/hyper_paramerter_tuning_results.csv", index=False)
# retrain with the best-ranked parameter set and evaluate on the held-out split
best_params = params.iloc[0]["params"]
model = compile_model(best_params, x_train)
history = model.fit(x_train, y_train, shuffle=True, epochs=10, batch_size=best_params["batch_size"], validation_split=0.2,
                    callbacks=[reduce_lr_accuracy, reduce_lr_loss, early_stop_loss, early_stop_accuracy])
predicted = model.predict(x_test)
# FIX: argmax indices follow the OneHotEncoder's sorted category order
# ('1'..'7'), but df['label'].unique() is in order of first appearance —
# the per-class rows of the report were mislabeled. Use the encoder's own
# category order as the target names.
print(classification_report(np.argmax(y_test, axis=1), np.argmax(predicted, axis=1),
                            target_names=encode.categories_[0]))
Epoch 1/10
105/105 [==============================] - 9s 79ms/step - loss: 1.8111 - accuracy: 0.3779 - val_loss: 1.5008 - val_accuracy: 0.6231
Epoch 2/10
105/105 [==============================] - 8s 75ms/step - loss: 1.2265 - accuracy: 0.6261 - val_loss: 0.9173 - val_accuracy: 0.7331
Epoch 3/10
105/105 [==============================] - 8s 73ms/step - loss: 0.8960 - accuracy: 0.7120 - val_loss: 0.7391 - val_accuracy: 0.7721
Epoch 4/10
105/105 [==============================] - 8s 76ms/step - loss: 0.7240 - accuracy: 0.7764 - val_loss: 0.6652 - val_accuracy: 0.7833
Epoch 5/10
105/105 [==============================] - 8s 76ms/step - loss: 0.6024 - accuracy: 0.8180 - val_loss: 0.6365 - val_accuracy: 0.7904
Epoch 6/10
105/105 [==============================] - 8s 81ms/step - loss: 0.5195 - accuracy: 0.8533 - val_loss: 0.6137 - val_accuracy: 0.7968
Epoch 7/10
105/105 [==============================] - 8s 81ms/step - loss: 0.4483 - accuracy: 0.8740 - val_loss: 0.6113 - val_accuracy: 0.7944
Epoch 8/10
105/105 [==============================] - 9s 89ms/step - loss: 0.3954 - accuracy: 0.8888 - val_loss: 0.6166 - val_accuracy: 0.7936
Epoch 00008: early stopping
precision recall f1-score support
7 0.87 0.81 0.84 242
2 0.84 0.79 0.81 227
5 0.82 0.96 0.89 228
1 0.70 0.74 0.72 212
6 0.73 0.63 0.68 207
3 0.67 0.68 0.68 238
4 0.82 0.82 0.82 214
accuracy 0.78 1568
macro avg 0.78 0.78 0.78 1568
weighted avg 0.78 0.78 0.78 1568
# now we will use the best parameters to train our model
# to predict on the final test set we will fit our model on the entire training set (i.e, comments and labels)
# NOTE(review): `comments` and `labels` are assumed to hold the cleaned full
# training texts and their string labels from an earlier cell (not visible
# here) — confirm before rerunning this cell in isolation
comments = tokenizer.texts_to_sequences(comments)
comments = pad_sequences(comments, maxlen=1000, padding='post', truncating='post')
labels = encode.transform(labels)
labels = labels.toarray()
best_params = params.iloc[0]["params"]
model = compile_model(best_params, comments)
# NOTE(review): validation_split=0.2 still holds out 20%, so despite the
# comment above this does not literally train on the entire set
history = model.fit(comments, labels, shuffle=True , epochs=10, batch_size=best_params["batch_size"], validation_split=0.2,
callbacks=[reduce_lr_accuracy, reduce_lr_loss, early_stop_loss, early_stop_accuracy])
Epoch 1/10 131/131 [==============================] - 10s 71ms/step - loss: 1.7278 - accuracy: 0.4247 - val_loss: 1.3289 - val_accuracy: 0.5727 Epoch 2/10 131/131 [==============================] - 11s 81ms/step - loss: 1.1115 - accuracy: 0.6437 - val_loss: 0.8799 - val_accuracy: 0.7092 Epoch 3/10 131/131 [==============================] - 10s 73ms/step - loss: 0.8088 - accuracy: 0.7498 - val_loss: 0.7570 - val_accuracy: 0.7417 Epoch 4/10 131/131 [==============================] - 11s 84ms/step - loss: 0.6518 - accuracy: 0.7982 - val_loss: 0.7141 - val_accuracy: 0.7538 Epoch 5/10 131/131 [==============================] - 10s 77ms/step - loss: 0.5565 - accuracy: 0.8324 - val_loss: 0.7038 - val_accuracy: 0.7577 Epoch 6/10 131/131 [==============================] - 10s 77ms/step - loss: 0.4984 - accuracy: 0.8551 - val_loss: 0.7008 - val_accuracy: 0.7577 Epoch 7/10 131/131 [==============================] - 11s 81ms/step - loss: 0.4227 - accuracy: 0.8756 - val_loss: 0.7070 - val_accuracy: 0.7596 Epoch 8/10 131/131 [==============================] - 13s 96ms/step - loss: 0.3884 - accuracy: 0.8897 - val_loss: 0.7210 - val_accuracy: 0.7557 Epoch 00008: early stopping
# plot the training curves: first accuracy vs val_accuracy, then loss vs
# val_loss, one figure each, with unified x-axis hover
for metric_pair in (("accuracy", "val_accuracy"), ("loss", "val_loss")):
    fig = go.Figure()
    for metric in metric_pair:
        values = history.history[metric]
        fig.add_trace(go.Scatter(x=list(range(len(values))), y=values, name=metric))
    fig.update_layout(hovermode="x")
    fig.show()
We can see that our model stopped training when val_accuracy and val_loss started to plateau.
# saving this model as well
# (legacy Keras HDF5 format; reload with tensorflow.keras.models.load_model)
model.save("output/model.h5")
# predict labels for the unlabeled test set and write the submission file.
# PERF FIX: the original called model.predict once per comment (thousands of
# single-row forward passes); cleaning, vectorizing, padding and predicting
# the whole set in one batch produces identical predictions far faster.
test_df = pd.read_csv("data/test.csv")
comments = test_df["text"]
cleaned = [clean(comment) for comment in comments]
seqs = tokenizer.texts_to_sequences(cleaned)
padded = pad_sequences(seqs, maxlen=1000, padding='post', truncating='post')
preds = model.predict(padded)
# inverse_transform maps each one-hot/probability row back to its string label
predicted_labels = encode.inverse_transform(preds)
test_labels = [int(label[0]) for label in predicted_labels]
submission = pd.DataFrame(columns=["Label"], data=test_labels)
submission.head()
submission.to_csv("output/submission.csv", index=False)